* 5: Estimates probability of sc/st/obc given last name
* Table A1, A2
* Root data folder; every section below reads from and writes to it through the data global.
* NOTE(review): the leading dots look like a placeholder - presumably this must be replaced with the local path to the replication package before running.
gl data ".../Replication Files/data"



* Import and Append Raw Election Data
* Reads three sheets from each of the two GP workbooks (Sarpanch, Member) and
* stacks them into elecdata.dta in the data folder. The file is rewritten after
* every sheet, so it always holds all sheets read so far.
{
    set more off

    * Number of sheets already written to elecdata.dta.
    local sheets_done = 0

    foreach cand in Sarpanch Member {
        foreach date in Dec_2011 April_2012 July_2012 {
            import excel "$data/GP_`cand'.xlsx", sheet("`date'") firstrow clear

            * The very first sheet (Sarpanch, Dec_2011) starts the file;
            * every later sheet has the accumulated file appended below it.
            if `sheets_done' > 0 {
                append using "$data/elecdata.dta"
            }

            save "$data/elecdata.dta", replace
            local ++sheets_done
        }
    }
}
*



* Last Names for Merging with Election Data (used in Table 5)
* Builds one row per last name holding the raw and the population-reweighted
* shares of GEN / OBC / SC / ST seats among candidates carrying that name,
* then saves the result as lastnames.dta in the data folder.
{
    use "$data/elecdata.dta", clear

    * NOTE(review): this renames Name to itself and looks like a leftover no-op; kept unchanged.
    ren Name Name
    replace Name = upper(Name)
    split Name

    * Last name = last non-empty token among Name1-Name6: start from the sixth
    * token and fall back to earlier tokens while the value is still empty.
    * NOTE(review): this assumes split produces exactly six tokens at most - confirm.
    gen lastname = Name6
    foreach tok of varlist Name5 Name4 Name3 Name2 Name1 {
        replace lastname = `tok' if missing(lastname)
    }

    * Strip blanks and punctuation from the last name.
    replace lastname = subinstr(lastname, " ", "", .)
    foreach ch in ")" "(" "," "." ";" ":" "/" {
        replace lastname = subinstr(lastname, "`ch'", "", .)
    }
    drop if missing(lastname)

    * Fold the raw seat labels (numbered in sorted order by egen group) pairwise
    * into four categories.
    * NOTE(review): relies on there being eight raw labels sorting as
    * ST, ST, SC, SC, OBC, OBC, GEN, GEN - verify with a tab of SeatType.
    egen seattype = group(SeatType)
    replace SeatType = "ST"  if inlist(seattype, 1, 2)
    replace SeatType = "SC"  if inlist(seattype, 3, 4)
    replace SeatType = "OBC" if inlist(seattype, 5, 6)
    replace SeatType = "GEN" if inlist(seattype, 7, 8)

    * Count candidates per (category, last name) cell.
    gen ones = 1
    collapse (sum) ones, by(SeatType lastname)

    * Numeric category code in alphabetical order: 1 GEN 2 OBC 3 SC 4 ST
    egen seat = group(SeatType)
    drop SeatType
    reshape wide ones, i(lastname) j(seat)

    * A name never observed in a category has a count of zero, not missing.
    foreach cnt of varlist ones1 ones2 ones3 ones4 {
        replace `cnt' = 0 if missing(`cnt')
    }
    gen ones = ones1 + ones2 + ones3 + ones4
    gsort -ones

    egen lastname_id = group(lastname)
    order lastname_id lastname

    * Names are already unique after the reshape; this collapse drops
    * lastname_id and leaves the data sorted by lastname.
    collapse (sum) ones*, by(lastname)

    /*
    Reweighting to the rural population (Bayes rule):
        Pr(cat | name) = Pr(cat and name) / Pr(name)
    For each category, Pr(cat and name) is proxied by
        (# politicians in cat with this name / divisor) * (cat share of rural population)
    and Pr(name) is the sum of the four proxies.

        cat   count var   divisor   rural pop share
        GEN   ones1       152298    .3721
        OBC   ones2        24007    .3251
        SC    ones3        16518    .1091
        ST    ones4        58634    .1937

    NOTE(review): the divisors are presumably the total number of politicians in
    each category; they are hard-coded rather than computed here - confirm.
    */
    local n_gen 152298
    local n_obc 24007
    local n_sc  16518
    local n_st  58634
    local s_gen .3721
    local s_obc .3251
    local s_sc  .1091
    local s_st  .1937

    gen denom = (ones1/`n_gen')*`s_gen' + (ones2/`n_obc')*`s_obc' + (ones3/`n_sc')*`s_sc' + (ones4/`n_st')*`s_st'

    local k = 0
    foreach g in gen obc sc st {
        local ++k
        gen prob_`g'_reweight = ((ones`k'/`n_`g'')*`s_`g'')/denom
    }

    * Unweighted shares: category count over the total count for the name.
    local k = 0
    foreach g in gen obc sc st {
        local ++k
        replace ones`k' = ones`k'/ones
        ren ones`k' prob_`g'
    }

    drop ones denom
}
save "$data/lastnames.dta", replace
*



* Table A.1: Common Last Names
* Lists the ten most frequent last names with their share of all candidates
* (prop) and the within-name shares of GEN / OBC / SC / ST seats.
{
    use "$data/elecdata.dta", clear

    * NOTE(review): this renames Name to itself and looks like a leftover no-op; kept unchanged.
    ren Name Name
    replace Name = upper(Name)
    split Name

    * Last name = last non-empty token among Name1-Name6: start from the sixth
    * token and fall back to earlier tokens while the value is still empty.
    * NOTE(review): this assumes split produces exactly six tokens at most - confirm.
    gen lastname = Name6
    foreach tok of varlist Name5 Name4 Name3 Name2 Name1 {
        replace lastname = `tok' if missing(lastname)
    }

    * Strip blanks and punctuation from the last name.
    replace lastname = subinstr(lastname, " ", "", .)
    foreach ch in ")" "(" "," "." ";" ":" "/" {
        replace lastname = subinstr(lastname, "`ch'", "", .)
    }
    drop if missing(lastname)

    * Fold the raw seat labels (numbered in sorted order by egen group) pairwise
    * into four categories.
    * NOTE(review): relies on there being eight raw labels sorting as
    * ST, ST, SC, SC, OBC, OBC, GEN, GEN - verify with a tab of SeatType.
    egen seattype = group(SeatType)
    replace SeatType = "ST"  if inlist(seattype, 1, 2)
    replace SeatType = "SC"  if inlist(seattype, 3, 4)
    replace SeatType = "OBC" if inlist(seattype, 5, 6)
    replace SeatType = "GEN" if inlist(seattype, 7, 8)

    * Count candidates per (category, last name) cell.
    gen ones = 1
    collapse (sum) ones, by(SeatType lastname)

    * Numeric category code in alphabetical order: 1 GEN 2 OBC 3 SC 4 ST
    egen seat = group(SeatType)
    drop SeatType
    reshape wide ones, i(lastname) j(seat)

    * A name never observed in a category has a count of zero, not missing.
    foreach cnt of varlist ones1 ones2 ones3 ones4 {
        replace `cnt' = 0 if missing(`cnt')
    }
    gen ones = ones1 + ones2 + ones3 + ones4
    gsort -ones

    egen lastname_id = group(lastname)
    order lastname_id lastname
    browse

    * Correct for Script Differences: relabel selected spellings so that the
    * collapse below pools them under one name.
    * NOTE(review): the ids are positions in the sorted list of distinct last
    * names, so they shift if the input data or the cleaning steps change.
    replace lastname="પટેલ" if inlist(lastname_id, 13696, 26315, 1636)
    replace lastname="પરમાર" if lastname_id==26411
    replace lastname="ચૌઘરી" if lastname_id==8752

    collapse (sum) ones*, by(lastname)

    * Within-name category shares. Order: GEN, OBC, SC, ST
    foreach cnt of varlist ones1 ones2 ones3 ones4 {
        replace `cnt' = `cnt'/ones
    }

    * Share of all counted candidates carrying each name.
    quietly summarize ones
    gen prop = ones/r(sum)

    * Most common names first; show the top ten.
    gsort -ones
    browse if _n<=10
}
*



* Table A.2: Common Last Names Reweighted
* Lists the ten most frequent last names with category probabilities
* reweighted by fixed population shares.
{
    use "$data/elecdata.dta", clear

    * NOTE(review): this renames Name to itself and looks like a leftover no-op; kept unchanged.
    ren Name Name
    replace Name = upper(Name)
    split Name

    * Last name = last non-empty token among Name1-Name6: start from the sixth
    * token and fall back to earlier tokens while the value is still empty.
    * NOTE(review): this assumes split produces exactly six tokens at most - confirm.
    gen lastname = Name6
    foreach tok of varlist Name5 Name4 Name3 Name2 Name1 {
        replace lastname = `tok' if missing(lastname)
    }

    * Strip blanks and punctuation from the last name.
    replace lastname = subinstr(lastname, " ", "", .)
    foreach ch in ")" "(" "," "." ";" ":" "/" {
        replace lastname = subinstr(lastname, "`ch'", "", .)
    }
    drop if missing(lastname)

    * Fold the raw seat labels (numbered in sorted order by egen group) pairwise
    * into four categories.
    * NOTE(review): relies on there being eight raw labels sorting as
    * ST, ST, SC, SC, OBC, OBC, GEN, GEN - verify with a tab of SeatType.
    egen seattype = group(SeatType)
    replace SeatType = "ST"  if inlist(seattype, 1, 2)
    replace SeatType = "SC"  if inlist(seattype, 3, 4)
    replace SeatType = "OBC" if inlist(seattype, 5, 6)
    replace SeatType = "GEN" if inlist(seattype, 7, 8)

    * Count candidates per (category, last name) cell.
    gen ones = 1
    collapse (sum) ones, by(SeatType lastname)

    * Numeric category code in alphabetical order: 1 GEN 2 OBC 3 SC 4 ST
    egen seat = group(SeatType)
    drop SeatType
    reshape wide ones, i(lastname) j(seat)

    * A name never observed in a category has a count of zero, not missing.
    foreach cnt of varlist ones1 ones2 ones3 ones4 {
        replace `cnt' = 0 if missing(`cnt')
    }
    gen ones = ones1 + ones2 + ones3 + ones4
    gsort -ones

    egen lastname_id = group(lastname)
    order lastname_id lastname

    * Script differences: relabel selected spellings so that the collapse
    * below pools them under one name.
    * NOTE(review): the ids are positions in the sorted list of distinct last
    * names, so they shift if the input data or the cleaning steps change.
    replace lastname="પટેલ" if inlist(lastname_id, 13696, 26315, 1636)
    replace lastname="પરમાર" if lastname_id==26411
    replace lastname="ચૌઘરી" if lastname_id==8752

    collapse (sum) ones*, by(lastname)

    /*
    Reweighted probability of each category given the name:
        weight(cat) = (count of name in cat / divisor) * share
        prob_cat_reweight = weight(cat) / sum of the four weights

        cat   count var   divisor   share
        GEN   ones1       152298    .3721
        OBC   ones2        24007    .3251
        SC    ones3        16518    .1091
        ST    ones4        58634    .1937

    NOTE(review): the divisors are presumably the total number of politicians in
    each category and the shares the category shares of the rural population;
    both are hard-coded rather than computed here - confirm.
    */
    local n_gen 152298
    local n_obc 24007
    local n_sc  16518
    local n_st  58634
    local s_gen .3721
    local s_obc .3251
    local s_sc  .1091
    local s_st  .1937

    gen denom = (ones1/`n_gen')*`s_gen' + (ones2/`n_obc')*`s_obc' + (ones3/`n_sc')*`s_sc' + (ones4/`n_st')*`s_st'

    local k = 0
    foreach g in gen obc sc st {
        local ++k
        gen prob_`g'_reweight = ((ones`k'/`n_`g'')*`s_`g'')/denom
    }

    * Unweighted shares: category count over the total count for the name.
    local k = 0
    foreach g in gen obc sc st {
        local ++k
        replace ones`k' = ones`k'/ones
        ren ones`k' prob_`g'
    }

    * Most common names first; show the reweighted probabilities for the top ten.
    gsort -ones
    browse lastname *_reweight if _n<=10
}
*

